//Cluster detection routine - count based, with counting around focal plants
//
//This version: 15/09/2016


#include <iostream>

#include <gsl/gsl_sf_bessel.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_poly.h>
#include <gsl/gsl_complex_math.h>
#include <gsl/gsl_sf.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_cdf.h>
#include <gsl/gsl_statistics_double.h>
#include <gsl/gsl_sort.h>


//UTILITY MACROS

#define DEG2RAD(x) (x * M_PI) / 180.0;
#define EARTH_RADIUS 6378.137

enum LOGIC { FALSE = 0 , TRUE = 1 };

//UTILITY FUNCTIONS


//Allocate array

double** alloc_array(int rows, int cols)
{
	double **array_to_alloc = new double*[rows];
	
	for (int i = 0; i < rows; i++)
	{
		array_to_alloc[i] = new double[cols]();
		
		if (array_to_alloc[i] == NULL)
		{
			fprintf(stderr, "ERROR - Memory could not be allocated, aborting\n");
			return(NULL);
		}
	}
	return(array_to_alloc);
}


double** free_array(int rows, double **ptr)
{
	for (int i = 0; i < rows; i++)
	{
		delete(ptr[i]);
	}
	
	delete(ptr);
	
	return(NULL);
}

double** alloc_large_array(long rows, long cols)
{
	double **array_to_alloc = new double*[rows];
	
	for (long i = 0; i < rows; i++)
	{
		array_to_alloc[i] = new double[cols]();
		
		if (array_to_alloc[i] == NULL)
		{
			fprintf(stderr, "ERROR - Memory could not be allocated, aborting\n");
			return(NULL);
		}
	}
	return(array_to_alloc);
}


double** free_large_array(long rows, double **ptr)
{
	for (long i = 0; i < rows; i++)
	{
		delete(ptr[i]);
	}
	
	delete(ptr);
	
	return(NULL);
}



int main (int argc, char *const argv[])
{
    //Parameters
    
    char ofile[1024];
    double ilat  = 0, ilon  = 0, jlat  = 0, jlon  = 0, distance = 0, cut, ipn, jpn;
    clock_t start, finish;
    int locsize, ctrsize;
    
    FILE *locdata;
    
    start = clock();
    
    
    if (argc < 3)
    {
        fprintf(stderr, "%s: ERROR - Not all arguments specified.\n", argv[0]);
        return(1);
    }
    
    
    //The location universe to be used is passed to the program as the first argument argv[1]
    
    //Structure of the file is as follows:
    //Line 1: Number of total_count control_count
    //Line x: scotts_id lat lon [naics,treat] other[employment]
    
    if ((locdata = fopen(argv[1], "r")) == NULL)
    {
        fprintf(stderr, "%s: ERROR - File %s not found, aborting.\n", argv[0], argv[1]);
        return(1);
    }
    
    //Output is written to the file specified as the second argument argv[2]
    strcpy(ofile, argv[2]);
    
    //Determine the cutoff distance to be processed from argv[3]
    if ((cut = atof(argv[3])) <= 0) {
        fprintf(stderr, "%s: ERROR - distance (in km) must be strictly positive\n", argv[0]);
        return(2);
    }
    
    //Location universe
    //Read in data for the universe of firm locations
    
    fscanf(locdata, "%d", &locsize); //Number of all locations
    fscanf(locdata, "%d", &ctrsize); //Number of control locations
    fprintf(stdout, "%s: Number of all locations to be processed = %d\n", argv[0], locsize);
    fprintf(stdout, "%s: Number of ctr locations to be processed = %d\n", argv[0], ctrsize);
    
    double **locspace = alloc_array(locsize, 6);
    double **empl_weights = alloc_array(locsize, 2);
    //Six things written to output - id, lat, lon, flag, p-val count, count of other firms with flag==1, count of other firms with flag==0
    
    int t = 0;
    
    while (fscanf(locdata, "%lf %lf %lf %lf %lf", &locspace[t][0], &locspace[t][1], &locspace[t][2], &locspace[t][3], &empl_weights[t][0]) != EOF)
    {
        //We first have to convert the latitude and the longitude into radians
        locspace[t][1] = DEG2RAD(locspace[t][1]);
        locspace[t][2] = DEG2RAD(locspace[t][2]);
        
        //Holds the output
        locspace[t][4] = 0.0;
        locspace[t][5] = 0.0;
        
        empl_weights[t][1] = 0.0;
        t++;
    }
    
    fclose(locdata);
    
    fprintf(stdout, "%s: Computing cluster p-values and counts with cutoff distance = %lf\n", argv[0], cut);
    
    
    //START GEOPROCESSING DATA
    
    fprintf(stdout, "%s: START\n", argv[0]);
    
    FILE *output = fopen(ofile, "w");
    
    fprintf(output, "id, lat, lon, treat, pval, own_count, other_count, other_W, own_W\n");
    
    //OUTER LOOP
    for (int i = 0; i < locsize; i++)
    {
        //CAN BE REMOVED IF EVERYTHING IS TO BE COMPUTE
        if (locspace[i][3] == 1)
        {
            
            ilat = locspace[i][1];
            ilon = locspace[i][2];
        
            //INNER LOOP
            for (int j = 0; j < locsize; j++)
            {
                
                if (i != j)
                {
                
                    jlat = locspace[j][1];
                    jlon = locspace[j][2];
                
                    if ((ilat != jlat) || (ilon != jlon))
                    {
                        //REPLACE WITH THE CORRESPONDING GSL FUNCTIONS IF NECESSARY...
                        distance = acos(sin(ilat) * sin(jlat) + cos(fabs(ilon - jlon)) * cos(ilat) * cos(jlat)) * EARTH_RADIUS;
                    
                        if (gsl_isnan(distance))
                        {
                            distance = 0.0;
                        }
                    }
                    else
                    {
                        distance = 0.0;
                    }
            
                    if (distance <= cut)
                    {
                        //Check if the plant belongs to treatment
                        ipn = locspace[i][3];
                        jpn = locspace[j][3];
                
                        if (jpn == ipn)
                        {
                            locspace[i][4]++;
                            empl_weights[i][1] += empl_weights[i][0];
                            
                        }
                        else
                        {
                            locspace[i][5]++;
                        }
                    }
            
                }
            }
        
            //Compute clustering p-values, based on the hypergeometric distribution (i.e., permutations)
        
            double p1 = gsl_cdf_hypergeometric_Q((unsigned int) locspace[i][4], (unsigned int) (locsize - ctrsize), (unsigned int) ctrsize, (unsigned int) (locspace[i][4] + locspace[i][5]));
            
            // If there is nothing around at all, replace the p=0.0 value with p=1.0 since clustering makes no sense in that case
            if ((locspace[i][4] == 0) && (locspace[i][5] == 0))
            {
                p1 = 1.0;
            }
            
            //double p2 = gsl_cdf_hypergeometric_Q((unsigned int) locspace[i][8], (unsigned int) locspace[i][6], (unsigned int) (locempl - locspace[i][6]), (unsigned int) (locspace[i][8] + locspace[i][10]));
        
            printf("%d\n", i);
            fprintf(output, "%lf,%lf,%lf,%lf,%lf,%lf,%lf,%lf,%lf\n", locspace[i][0], (locspace[i][1]*180)/M_PI, (locspace[i][2]*180)/M_PI, locspace[i][3], p1, locspace[i][4], locspace[i][5], empl_weights[i][1], empl_weights[i][0]);
        
        }
            
        if (i%100 == 0 && i != 0)
        {
            fprintf(stdout, "%s: Iteration %5d done\n", argv[0], i);
        }
    }
    
    finish = clock();
    
    fclose(output);
    
    locspace = free_array(locsize, locspace);
    
    fprintf(stdout, "%s: END\n", argv[0]);
    fprintf(stdout, "%s: Total time for geosmoothing data = %d seconds\n", argv[0], (int) ((finish - start) / CLOCKS_PER_SEC));
    
    return(0);
}
